The purpose of this document is to specifically compare the VS and FC variant carriers. We ignore the WT mice here. I’m hoping that will help with interpretation.

Because the FC allele is the common allele in humans, and the VS allele is protective, all comparisons will use the FC allele as the reference.

(This workflow was copied from 1a.Differential_Expression.Rmd on October 29 at 3:23 pm.)

rm(list = ls())

library(here)


args <- commandArgs(trailingOnly=T)
comparison.type <- args[1]
age.batch <- args[2]

if(is.na(comparison.type)){
    #comparison.type = "Hom" #compares homozygous FC/FC to VS/VS
    #comparison.type = "Het" #compares homozygous WT/FC to WT/VS
    comparison.type = "Carrier" #compares all carriers of VS to carriers of FC alleles
}

if(comparison.type == "Hom"){
    vs.allele <- "VS/VS"; fc.allele = "FC/FC"
}
if(comparison.type == "Het"){
    vs.allele <- "WT/VS"; fc.allele = "WT/FC"
}
if(comparison.type == "Carrier"){
    vs.allele <- "VS"; fc.allele = "FC"
}


if(is.na(age.batch)){
    age.batch <- 4
}

alpha <- 1e-3 # alpha for significant differential expression
min.set.n <- 10 #the minimum gene set size to be considered

#subgroup <- NULL
subgroup <- list("age_batch" = as.numeric(age.batch))
#subgroup <- list("age_batch" = 12)
#subgroup <- list("age_batch" = 4, "sex_ge" = "Female") #example with more than one filter

subgroup.results <- paste(sapply(1:length(subgroup), 
function(x) paste(names(subgroup)[x], subgroup[[x]], sep = "_")), 
collapse = "_")
results.dir <- here("Results", subgroup.results)


processed.data.dir <- file.path(results.dir, "processed_data")
general.processed.data.dir <- here("Results", "Processed_Data")
general.data.dir <- here("Data", "general")

The analysis compares VS mice to FC mice. The age of the mice analyzed is 4 months.

all.fun <- list.files(here("Code"), full.names = TRUE)
for(i in 1:length(all.fun)){source(all.fun[i])}
needed.libraries <- c("synapser", "pheatmap", "DESeq2", "DT", "vioplot", "RColorBrewer",
  "gprofiler2", "cluster", "pathview", "clusterProfiler", "stringr", "igraph",
  "fgsea")
load_libraries(needed.libraries)

This table uses processed data from 1.Klotho_Initial_Data_Visualization.Rmd

mouse.genes <- read.delim(here("Data", "general", "mouse_gene_info.txt"))
info <- read.table(file.path(processed.data.dir, "mouse_info.csv"), sep = ",", header = TRUE, row.names = 1)
raw.expr <- read.table(here("Data", "Mouse", "rsem.merged.gene_counts.tsv"), header = TRUE)
norm.expr <- readRDS(file.path(processed.data.dir, "Normalized_Expression.RDS"))
scaled.expr <- readRDS(file.path(processed.data.dir, "Scaled_Expression.RDS"))

Calculate differential expression for each genotype compared to WT.

gene_t_test <- function(x, y){
    varx <- var(x)
    vary <- var(y)
    if(varx > 0 && vary > 0){
        result <- t.test(x,y)
    }else{
        result <- NA
    }
    return(result)
}


geno_de <- function(ref.allele, alt.allele, covar.table, expr.mat){
    ref.idx <- grep(ref.allele, covar.table[,"genotype"])
    alt.idx <- grep(alt.allele, covar.table[,"genotype"])
    covar.mat <- dummy_covar(covar.table[,c("sequencingBatch", "sex_ge")])
    adj.expr <- adjust(t(expr.mat), covar.mat)
    test <- apply(adj.expr, 2, function(x) gene_t_test(x[ref.idx], x[alt.idx]))
    test.info <- t(sapply(test, function(x) if(length(x) > 1){c(x$estimate, x$conf.int, x$p.value)}else{rep(NA, 5)}))
    colnames(test.info) <- c(paste0("Mean_", c(ref.allele, alt.allele)), 
        paste0("conv_int_", c(ref.allele, alt.allele)), "p")
    return(test.info)
}

QQ plots

The QQ plots for the DE p values are shown below. These two genotypes seem to have a lot of differential expression between them.

par(mfrow = c(1,2))
vs.diff.expr <- geno_de(ref.allele = fc.allele, alt.allele = vs.allele, 
    covar.table = info, expr.mat = scaled.expr)
qqunif.plot(vs.diff.expr[,"p"], plot.label = paste(fc.allele, "vs", vs.allele))
#boxplot(vs.diff.expr[,1:2])
write.table(vs.diff.expr, file.path(results.dir, 
    paste0(gsub("/", ".", fc.allele), "_vs_", gsub("/", ".", vs.allele), "_DEG.csv")), sep = ",", quote = FALSE)

Volcano Plots

Below is the volcano plots.

de <- vs.diff.expr[,2] - vs.diff.expr[,1]
sig.idx <- which(vs.diff.expr[,"p"] <= alpha)
#length(sig.idx)
sig.col <- rep("black", nrow(vs.diff.expr))
sig.col[sig.idx] <- "red"
plot(de, -log10(vs.diff.expr[,"p"]), xlab = "Effect Size", pch = 16,
    ylab = "-log10(pval)", main = paste(fc.allele, "vs.", vs.allele), col = sig.col)

FC-VS enrichment

We looked at the enrichments of what was significantly upregulated and downregulated in VS vs. FC.

sig.idx <- which(vs.diff.expr[,"p"] <= alpha)
up.in.vs <- intersect(sig.idx, which(de > 0))
down.in.vs <- intersect(sig.idx, which(de < 0))

up.enrich <- gost(rownames(vs.diff.expr)[up.in.vs], organism = "mmusculus", 
    sources = c("GO", "KEGG", "REACTOME", "HP", "CORUM"))
plot.enrichment(up.enrich, plot.label = paste("UP in", vs.allele), max.term.size = 300,
    num.terms = 30)

down.enrich <- gost(rownames(vs.diff.expr)[down.in.vs], organism = "mmusculus", 
    sources = c("GO", "KEGG", "REACTOME", "HP", "CORUM"))
plot.enrichment(down.enrich, plot.label = paste("DOWN in", vs.allele), max.term.size = 300,
    num.terms = 30)

GSEA

Instead of using a significance cutoff, we can use GSEA to look at enrichments for the genes that are up or downregulated in VS vs. FC mice.

We can use several lists for comparison: Biodomains, KEGG pathways, and the intersection between biodomains and KEGG pathways. These were generated by 1.Klotho_Initial_Data_Visualiztion.Rmd and stored in Results/Processed_Data

bd.list <- readRDS(file.path(general.processed.data.dir, "Mouse_Biodomains_for_GSEA.RDS"))
bd.go.list <- readRDS(file.path(general.processed.data.dir, "Mouse_Biodomains_sub_GO_for_GSEA.RDS"))
kegg.list <- readRDS(file.path(general.processed.data.dir, "Mouse_KEGG_for_GSEA.RDS"))
intersect.list <- readRDS(file.path(general.processed.data.dir, "Mouse_KEGG_Intersections_for_GSEA.RDS"))


run_gsea <- function(vals, pos.neg = "pos", gsea.list){
    if(pos.neg == "pos"){
        idx <- which(vals > 0)
        idx.order <- order(vals[idx], decreasing = TRUE)
        sorted.vals <- vals[idx[idx.order]]
        #plot(sorted.vals)
    }
    if(pos.neg == "neg"){
        idx <- which(vals < 0)
        idx.order <- order(abs(vals[idx]), decreasing = TRUE)
        sorted.vals <- abs(vals[idx[idx.order]])
        #plot(sorted.vals)
    }

    gsea.enrich <- fgsea::fgsea(gsea.list, sorted.vals, scoreType = "pos")
    norm.es <- as.numeric(as.matrix(gsea.enrich[,"NES"]))
    pval <- as.numeric(as.matrix(gsea.enrich[,"padj"]))
    names(norm.es) <- names(pval) <- gsea.enrich$pathway
    result <- cbind(norm.es, pval)
    return(result)
}

Biodomains

The following plots show the normalized enrichment score for each biodomain and each direction of regulation, either up in the VS allele or down in the VS allele relative to the FC allele.

enrich.types <- c("pos", "neg")
bd.gsea <- vector(mode = "list", length = length(enrich.types))
names(bd.gsea) <- enrich.types

for(tp in 1:length(enrich.types)){
    bd.gsea[[tp]] <- run_gsea(de, enrich.types[tp], bd.list)
}


par(mar = c(4,12,2,2), mfrow = c(2,1))
for(i in 1:length(enrich.types)){
    barplot(sort(bd.gsea[[i]][,1]), las = 2, horiz = TRUE, main = enrich.types[i])    
}

KEGG

top.n <- 20

kegg.gsea <- vector(mode = "list", length = length(enrich.types))
names(kegg.gsea) <- enrich.types

for(tp in 1:length(enrich.types)){
    kegg.gsea[[tp]] <- run_gsea(de, enrich.types[tp], kegg.list)
}


#par(mar = c(4,12,2,2), mfrow = c(2,2))
#for(i in 1:length(bd.gsea)){
#    barplot(sort(kegg.gsea[[i]][,1], decreasing = TRUE)[1:top.n], las = 2, horiz = TRUE, main = combo.names[i])    
#}

common.names <- Reduce("intersect", lapply(kegg.gsea, rownames))
enrich.kegg <- matrix(NA, nrow = length(common.names), ncol = length(enrich.types))
rownames(enrich.kegg) <- common.names
colnames(enrich.kegg) <- enrich.types
for(i in 1:length(kegg.gsea)){
    enrich.kegg[common.names,i] <- kegg.gsea[[i]][common.names, 1]
}

#filter to rows that have at least one value above a threshold
high.idx <- which(apply(enrich.kegg, 1, max) > 1.5)

The following plots show the most strongly enriched KEGG pathways for genes that are up and down in VS relative to FC.

for(i in 1:ncol(enrich.kegg)){
    par(mar = c(4,24,2,2))
    top.kegg <- rev(sort(enrich.kegg[,i], decreasing = TRUE)[1:top.n])
    barplot(top.kegg, horiz = TRUE, las = 2, main = colnames(enrich.kegg)[i])
}

Biodomain-KEGG intersections

We found the intersection of each biodomain and KEGG pathway. We calculated the enrichment scores for each of these gene sets. The violin plots below show the enrichment scores for all KEGG pathways in each biodomain. There is fairly even enrichment across all biodomains for both directions of expression.

#This function take a list of gsea results with a positive and 
#negative sorting and plots them on one graph. The upregulated 
#stuff gets positive values and the downregulated stuff gets
#negative values
plot_gsea <- function(gsea_result, alt.allele){
  
    neg.idx <- which(names(gsea_result) == "neg")
    pos.idx <- which(names(gsea_result) == "pos")
    
    pos.mean <- sapply(gsea_result[[pos.idx]], median)
    neg.mean <- sapply(gsea_result[[neg.idx]], median)
    mean.diff <- pos.mean - neg.mean
    diff.order <- order(mean.diff)

    xmax <- max(unlist(gsea_result))
    xmin <- 0

    dir.names <- c(paste("up in", alt.allele), paste("down in", alt.allele))
    dir.col <- c("#fec44f", "#9ecae1")
    names(dir.col) <- dir.names

    par(mar = c(4,12,2,2))
    plot.new()
    plot.window(xlim = c(xmin, xmax), ylim = c(1, (length(gsea_result[[1]])*3)))
    axis(1)

    par(xpd = NA)
    plot.idx <- 1
    for(i in diff.order){
        pos.vals <- gsea_result[[pos.idx]][[i]]
        neg.vals <- gsea_result[[neg.idx]][[i]]

        if(length(neg.vals) > 0){
            vioplot(neg.vals, at = (plot.idx*2)-0.5, col = dir.col[grep("down", dir.names)], 
                add = TRUE, horizontal = TRUE, frame.plot = FALSE)
            #stripchart(neg.vals, method = "jitter", pch = 16, col = "darkgray", add = TRUE) 
        }

        if(length(pos.vals) > 0){
        vioplot(pos.vals, at = (plot.idx*2)+0.5, col = dir.col[grep("up", dir.names)], 
                add = TRUE, horizontal = TRUE, frame.plot = FALSE)
            #stripchart(pos.vals, method = "jitter", pch = 16, col = "darkgray", add = TRUE) 
        }

        draw.rectangle(xmin, xmax, (plot.idx*2)-1, (plot.idx*2)+1)

        text(x = xmin, y = plot.idx*2, names(gsea_result[[pos.idx]])[i], adj = 1)
        plot.idx = plot.idx + 1.5
    }
    mtext("Normalized Enrichment Score", side = 1, line = 2.5)
    par(xpd = TRUE)

    plot.dim = par("usr")
    legend(plot.dim[3], plot.dim[1], fill = dir.col, legend = names(dir.col))
}
intersect.gsea <- vector(mode = "list", length = length(enrich.types))
names(intersect.gsea) <- enrich.types

for(tp in 1:length(enrich.types)){
    test <- lapply(intersect.list, function(x) run_gsea(de, enrich.types[tp], x))
    nes <- sapply(test, function(x) x[,"norm.es"])
    intersect.gsea[[tp]] <- nes
}

#pdf("~/Desktop/gsea.pdf", width = 6, height = 12)
plot_gsea(gsea_result = intersect.gsea, vs.allele)

#dev.off()
unlisted.gsea <- intersect.gsea
for(i in 1:length(unlisted.gsea)){
    unlisted.gsea[[i]] <- unlist(intersect.gsea[[i]], recursive = FALSE)
}
common.names <- Reduce("intersect", lapply(unlisted.gsea, names))
enrich.intersect <- matrix(NA, nrow = length(common.names), ncol = 2)
rownames(enrich.intersect) <- common.names
colnames(enrich.intersect) <- enrich.types
for(i in 1:length(intersect.gsea)){
    enrich.intersect[common.names,i] <- unlisted.gsea[[i]][common.names]
}

The plots below show the top enriched Biodomain-KEGG intersections overall.

#pdf("~/Desktop/top_enrich.pdf", width = 9, height = 5)
for(i in 1:ncol(enrich.intersect)){
    top.terms <- sort(enrich.intersect[,i], decreasing = TRUE)[1:top.n]
    par(mar = c(4, 25, 2, 2))
    barplot(rev(top.terms), horiz = TRUE, las = 2, main = colnames(enrich.intersect)[i])
}

#dev.off()
#playing around with ranking domains based on how many 
#terms are in the top n
#This can tell you which biodomains are represented in
#the top n terms, and how they are ranked.
#care about the top n terms among those terms.
#but changing n really changes the term representation
#and ranking, so it is very subjective
#apply(enrich.intersect, 2, function(x) length(which(x > 1.8)))
n.top = 10
par(mfrow = c(1,2))
for(i in 1:ncol(enrich.intersect)){
    ordered.terms <- sort(enrich.intersect[,i], decreasing = TRUE)
    split.terms <- strsplit(names(ordered.terms)[1:n.top], ".", fixed = TRUE)
    ordered.bd <- sapply(split.terms, function(x) x[1])
    bd.rank <- lapply(names(bd.list), function(x) which(ordered.bd == x))
    names(bd.rank) <- names(bd.list)
    top_big <- lapply(bd.rank, function(x) n.top-x)
    rank.order <- order(sapply(top_big, function(x) if(length(x) > 0){median(x)}else{0}))
    par(mar = c(4,14, 4, 2))
    boxplot(top_big[rank.order], las = 2, horizontal = TRUE, main = colnames(enrich.intersect)[i]) 
}
mtext(paste("Top", n.top, "terms"), side = 3, outer = TRUE, line = -1.5)

Biodomain-GO intersections

We also calculated intersections between GO terms and biodomains.

go.intersect.gsea <- vector(mode = "list", length = length(enrich.types))
names(go.intersect.gsea) <- enrich.types

for(tp in 1:length(enrich.types)){
    test <- lapply(bd.go.list, function(x) run_gsea(de, enrich.types[tp], x))
    nes <- sapply(test, function(x) x[,"norm.es"])
    go.intersect.gsea[[tp]] <- nes
}

#pdf("~/Desktop/GO.pdf", width = 7, height = 12)
plot_gsea(gsea_result = go.intersect.gsea, alt.allele = vs.allele)

#dev.off()
unlisted.go.gsea <- go.intersect.gsea
for(i in 1:length(unlisted.go.gsea)){
    unlisted.go.gsea[[i]] <- unlist(go.intersect.gsea[[i]], recursive = FALSE)
}
common.names <- Reduce("intersect", lapply(unlisted.go.gsea, names))
go.enrich.intersect <- matrix(NA, nrow = length(common.names), ncol = 2)
rownames(go.enrich.intersect) <- common.names
colnames(go.enrich.intersect) <- enrich.types
for(i in 1:length(intersect.gsea)){
    go.enrich.intersect[common.names,i] <- unlisted.go.gsea[[i]][common.names]
}

The plots below show the top enriched Biodomain-GO term intersections overall.

#pdf("~/Desktop/top_enrich.pdf", width = 9, height = 5)
for(i in 1:ncol(go.enrich.intersect)){
    top.terms <- sort(go.enrich.intersect[,i], decreasing = TRUE)[1:top.n]
    par(mar = c(4, 25, 2, 2))
    barplot(rev(top.terms), horiz = TRUE, las = 2, main = colnames(go.enrich.intersect)[i])
}

#dev.off()

Expression Networks

For both the KEGG-biodomain and GO-biodomain intersections we generated matrices in which the rows are biodomains, and the columns are KEGG pathways or GO terms. The entry in each cell is the difference in mean expression between the VS animals and FC animals such that a positive value indicates higher expression in the VS animals and a negative value indicates lower expression in the VS animals.

expr_mat <- function(gene.list, expr.means){

    row.terms <- names(gene.list)
    col.terms <- Reduce("union", lapply(gene.list, names))

    intersect.mat <- matrix(0, nrow = length(row.terms), ncol = length(col.terms))
    rownames(intersect.mat) <- row.terms
    colnames(intersect.mat) <- col.terms

    vs.col <- intersect(grep("Mean", colnames(expr.means)), grep("VS", colnames(expr.means)))
    fc.col <- intersect(grep("Mean", colnames(expr.means)), grep("FC", colnames(expr.means)))

    for(bd in 1:length(row.terms)){
        row.name <- row.terms[bd]
        common.col <- intersect(names(gene.list[[bd]]), col.terms)
        if(length(common.col) == 0){next()}
        for(k in 1:length(common.col)){
            col.name <- common.col[k]
            gene.group <- gene.list[[bd]][[k]]
            common.genes <- intersect(gene.group, rownames(expr.means))
            if(length(common.genes) > 0){
                vs.expr <- expr.means[common.genes, vs.col]
                fc.expr <- expr.means[common.genes, fc.col]
                #boxplot(list("VS" = vs.expr, "FC" = fc.expr), main = paste(row.name, col.name))
                intersect.mat[row.name, col.name] <- mean(fc.expr) - mean(vs.expr)
            }
        }
    }
    return(intersect.mat)
}



#This function actuall tests differential expression
#of intersection terms. It returns a matrix of p values.
diff_expr_intersect <- function(gene.list, expr.mat, covar.mat, ref.allele, alt.allele){
    
    row.terms <- names(gene.list)
    col.terms <- Reduce("union", lapply(gene.list, names))

    intersect.mat <- matrix(NA, nrow = length(row.terms), ncol = length(col.terms))
    rownames(intersect.mat) <- row.terms
    colnames(intersect.mat) <- col.terms

    dummy.mat <- dummy_covar(covar.mat[,c("sequencingBatch", "sex_ge")])
    adj.expr <- adjust(t(expr.mat), dummy.mat)

    ref.idx <- grep(ref.allele, covar.mat[,"genotype"])
    alt.idx <- grep(alt.allele, covar.mat[,"genotype"])

    for(bd in 1:length(row.terms)){
        k.terms <- names(gene.list[[bd]])
        if(length(k.terms) == 0){next()}
        for(k in 1:length(k.terms)){
            test.genes <- gene.list[[bd]][[k]]
            common.genes <- intersect(rownames(expr.mat), test.genes)
            if(length(common.genes) > min.set.n){
                test.mats <- list(adj.expr[ref.idx, common.genes], adj.expr[alt.idx, common.genes])
                #boxplot(test.mats)
                #vioplot(lapply(test.mats, colMeans), method = "jitter", vertical = TRUE)
                #test.result <- t.test(test.mats[[1]], test.mats[[2]])
                test.result <- t.test(colMeans(test.mats[[1]]), colMeans(test.mats[[2]]))
                intersect.mat[row.terms[bd], k.terms[k]] <- test.result$p.value
           }
        }
    }
    return(intersect.mat)
}

KEGG-Biodomain Intersections

The plot below shows the differential expression vs. the -log10(p) for each KEGG-Biodomain intersection. Significant intersections are shown in red. These have an FDR adjusted p value less than 0.05.

kegg.bd.mat <- expr_mat(gene.list = intersect.list, expr.means = vs.diff.expr)
kegg.bd.p <- diff_expr_intersect(gene.list = intersect.list, expr.mat = scaled.expr, 
    covar.mat = info, ref.allele = fc.allele, alt.allele = vs.allele)

kegg.adj.p <- p.adjust(kegg.bd.p)
kegg.sig.p <- as.vector(kegg.bd.p)[which(kegg.adj.p < 0.05)]
#max(kegg.sig.p)
kegg.sig.idx <- which(kegg.bd.p <= max(kegg.sig.p))
kegg.sig.col <- rep("black", length(kegg.bd.p))
kegg.sig.col[kegg.sig.idx] <- "red"
plot(kegg.bd.mat, -log10(kegg.bd.p), xlab = "KEGG-Biodomain Differential Expression",
    ylab = "-log10(p)", col = sig.col, pch = 16)

The following heat map shows the differential expression for just KEGG-biodomain intersections that were significant. Red values indicate KEGG-biodomain intersections that were more highly expressed in VS mice, and blue values indicate KEGG-biodomain intersections that had reduced expression in VS mice.

These are a bit difficult to interpret because they are kind of all over the place. Each column tells a story

kegg.sig.row.col <- idx_to_row_col(which(kegg.adj.p <= 0.05), nrow(kegg.bd.p))

sig.kegg.mat <- kegg.bd.mat[sort(unique(kegg.sig.row.col[,1])), sort(unique(kegg.sig.row.col[,2]))]

#pdf("~/Desktop/kegg_bd_mat.pdf", width = 8, height = 14)
if(sum(dim(sig.kegg.mat)) > 4){
    pheatmap(t(sig.kegg.mat))
}
#dev.off()

#plot.decomp(kegg.bd.mat[,unique(sig.mat[,2])])
#plot.decomp(t(kegg.bd.mat[,unique(sig.mat[,2])]), pc = 4)
#decomposing this matrix doesn't make sense because
#the GO terms are mostly non-overlapping across biodomains
#but we can still use the matrix to investigate individual
#terms
#If return.data is TRUE, data matrices are returned invisibly. 
#Otherwise gene lists are returned
#gene.set is an optional vector of gene names. If specified
#row.num and col.num are ignored and values are retrieved
#for the specified genes. Otherwise, if row.num and col.num
#are specified, the gene set is defined by the biodomain and
#intersecting term.

plot_intersect_expr <- function(intersect.mat, row.num, col.num, 
    gene.set = NULL, gene.list, 
    expr.mat, covar.table, ref.allele, alt.allele, label.srt = 0, las = 2,
    plot.by = c("gene", "group", "overall"), return.data = FALSE){

    plot.by <- plot.by[1]

    if(is.null(gene.set)){
        outer.name <- rownames(intersect.mat)[row.num]
        inner.name <- colnames(intersect.mat)[col.num]
        outer.idx <- which(names(gene.list) == outer.name)
        inner.idx <- which(names(gene.list[[outer.idx]]) == inner.name)
        gene.set <- gene.list[[outer.idx]][[inner.idx]]
    }else{
        outer.name <- "Genes specified"
        inner.name <- "by hand"
    }

    common.genes <- intersect(gene.set, rownames(expr.mat))

    plot.name = paste(outer.name, inner.name, sep = "\n")

    ref.idx <- grep(ref.allele, covar.table[,"genotype"])
    alt.idx <- grep(alt.allele, covar.table[,"genotype"])
    covar.mat <- dummy_covar(covar.table[,c("sequencingBatch", "sex_ge")])
    adj.expr <- adjust(t(expr.mat[common.genes,,drop=FALSE]), covar.mat)
    
    result <- list(adj.expr[ref.idx,,drop=FALSE], adj.expr[alt.idx,,drop=FALSE])
    names(result) <- c(ref.allele, alt.allele)

    #turn inside out
    by.gene <- vector(mode = "list", length = length(common.genes))
    names(by.gene) <- common.genes
    by.gene <- lapply(common.genes, function(x) sapply(result, function(y) y[,x]))
    gene.names <- sapply(common.genes, function(x) mouse.genes[which(mouse.genes[,"ensembl_gene_id"] == x)[1], "external_gene_name"])
    names(by.gene) <- gene.names

    gene.table <- cbind(rep(outer.name, length(common.genes)), 
        rep(inner.name, length(common.genes)), common.genes, gene.names)
    colnames(gene.table) <- c("Biodomain", "Intersecting_Term", "GeneID", "Gene_Name")

    for(i in 1:length(result)){
        colnames(result[[i]]) <- gene.names
    }

    if(is.na(plot.by)){
        if(return.data){
            return(result)
        }else{
            return(gene.table)
        }
    }

    if(plot.by == "gene"){
        #order by mean diff
        #order.val <- sapply(1:ncol(result[[1]]), function(x) mean(result[[1]][,x]) - mean(result[[2]][,x]))
        #order by mean of the reference allele
        list.idx <- which(names(result) == ref.allele)
        order.val <- colMeans(result[[list.idx]])
        ordered.result <- result
        for(i in 1:length(result)){
            ordered.result[[i]] <- result[[i]][,order(order.val),drop=FALSE]
        }
        plot.grouped.boxes(ordered.result, type = "matrix", main = plot.name, label.srt = label.srt, print.vals = NA)
        plot.dim <- par("usr")
        segments(plot.dim[1], 0, plot.dim[2], 0)
 
    }
    if(plot.by == "group"){
        #order by mean of VS allele
        list.idx <- which(colnames(by.gene[[1]]) == ref.allele)
        order.val <- sapply(by.gene, function(x) mean(x[,list.idx]))
        plot.grouped.boxes(by.gene[order(order.val)], type = "matrix", main = plot.name, label.srt = label.srt, print.vals = NA)
        plot.dim <- par("usr")
        segments(plot.dim[1], 0, plot.dim[2], 0)
    }
    if(plot.by == "overall"){
        ymin <- min(unlist(by.gene))
        ymax <- max(unlist(by.gene))

        #order by alt allele
        list.idx <- which(names(result) == ref.allele)
        order.val <- colMeans(result[[list.idx]])
        ordered.result <- result
        for(i in 1:length(result)){
            ordered.result[[i]] <- result[[i]][,order(order.val),drop=FALSE]
        }

        par(mfrow = c(2,2))
        boxplot(ordered.result[[1]], ylim = c(ymin, ymax), main = names(result)[1], las = las)
        plot.dim <- par("usr")
        segments(plot.dim[1], 0, plot.dim[2], 0)
        boxplot(ordered.result[[2]], ylim = c(ymin, ymax), main = names(result)[2], las = las)
        segments(plot.dim[1], 0, plot.dim[2], 0)

        group.expr <- list(unlist(ordered.result[[1]]), unlist(ordered.result[[2]]))
        group.means <- lapply(group.expr, colMeans)
        #do we test individuals or means?
        #group.test <- t.test(group.expr[[1]], group.expr[[2]])
        if(length(group.means[[1]]) > 3){
            group.test <- t.test(group.means[[1]], group.means[[2]])
            test.p <- group.test$p.value
        }else{
            test.p <- NA
        }
        boxplot(group.means, names = names(result), main = paste0("Expression Means by Group\n", "p = ", signif(test.p, 2)))
        stripchart(group.means, add = TRUE, vertical = TRUE, method = "jitter", col = "#dd1c77", pch = 16)
        plot.dim <- par("usr")
        segments(plot.dim[1], 0, plot.dim[2], 0)

        #plot.text(outer.name, x = 0.5, y = 0.75, cex = 1.5)
        #plot.text(words_to_paragraph(words = strsplit(inner.name, " ")[[1]], sep = " ", line.len = 3), add = TRUE)

        if(length(group.means[[1]]) > 3){
        plot.with.model(colMeans(result[[1]]), colMeans(result[[2]]), 
            xlab = paste(ref.allele, "Means"), ylab = paste(alt.allele, "Means"), 
            report = "cor.test")
        }
        
        main.text <- words_to_paragraph(words = c(outer.name, strsplit(inner.name, " ")[[1]]), 
            sep = " ", line.len = 3)
        mtext(main.text, outer = TRUE, side = 3, line = -1.5)

    }
    if(return.data){
        invisible(result)
    }else{
        invisible(gene.table)
    }
}

plot_set <- function(result.mat, row.col.ind, gene.list, plot.by = c("gene", "group", "overall"), 
    file_path = ".", file_tag = ""){

    high.terms <- lapply(rownames(result.mat), 
        function(x) colnames(result.mat)[row.col.ind[which(rownames(result.mat)[row.col.ind[,1]] == x),2]])
    names(high.terms) <- rownames(result.mat)
    num.terms <- sapply(high.terms, length)
    has.terms <- which(num.terms > 0)
    #par(mar = c(4,16,2,2))
    #barplot(sort(sapply(high.terms, length)), las = 2, horiz = TRUE)
    for(bd in has.terms){
        bd.name <- rownames(result.mat)[bd]
        high.bd <- row.col.ind[which(row.col.ind[,1] == bd),,drop=FALSE]
        
        gene.tables <- vector(mode = "list", length = nrow(high.bd))

        pdf(file.path(file_path, paste0(file_tag, "_", bd.name, "_Expression.pdf")))
        for(g in 1:nrow(high.bd)){
            gene.tables[[g]] <- plot_intersect_expr(intersect.mat = result.mat, row.num = high.bd[g,1], 
                col.num = high.bd[g,2], gene.set = NULL, gene.list = gene.list, 
                expr.mat = scaled.expr, covar.table = info, ref.allele = fc.allele, 
                alt.allele = vs.allele, plot.by = plot.by, label.srt = 90, las = 2,
                return.data = FALSE)

        }
        dev.off()

        bd.gene.table <- Reduce("rbind", gene.tables)
        
        write.table(bd.gene.table, file.path(file_path, paste0(file_tag, "_", bd.name, "_Expression.txt")),
            sep = "\t", quote = FALSE, row.names = FALSE)

    }
}
kegg_expr_dir <- file.path(results.dir, paste0("KEGG_Expression_", comparison.type))
if(!file.exists(kegg_expr_dir)){dir.create(kegg_expr_dir)}

thresh.percentile = 90
kegg.high.thresh <- get.percentile(kegg.bd.mat[which(kegg.bd.mat > 0)], thresh.percentile)
kegg.low.thresh <- get.percentile(kegg.bd.mat[which(kegg.bd.mat < 0)], thresh.percentile)
kegg.sig.high <- idx_to_row_col(intersect(which(kegg.bd.p <= (max(kegg.sig.p))), which(kegg.bd.mat < kegg.low.thresh)), nrow(kegg.bd.p))
#nrow(kegg.sig.high)
kegg.sig.low <- idx_to_row_col(intersect(which(kegg.bd.p <= (max(kegg.sig.p))), which(kegg.bd.mat > kegg.high.thresh)), nrow(kegg.bd.p))
#nrow(kegg.sig.low)
if(nrow(kegg.sig.high) > 0){
    plot_set(result.mat = kegg.bd.mat, row.col.ind = kegg.sig.high, gene.list = intersect.list,
        plot.by = "overall", file_path = kegg_expr_dir, file_tag = "KEGG-BD_intersection_high")
}
if(nrow(kegg.sig.low) > 0){
    plot_set(result.mat = kegg.bd.mat, row.col.ind = kegg.sig.low, gene.list = intersect.list,
        plot.by = "overall", file_path = kegg_expr_dir, file_tag = "KEGG-BD_intersection_low")
}

GO-biodomain Intersections

go.bd.mat <- expr_mat(bd.go.list, vs.diff.expr)
go.bd.p <- diff_expr_intersect(bd.go.list, scaled.expr, info, fc.allele, vs.allele)
go.adj.p <- p.adjust(go.bd.p)
go.sig.p <- as.vector(go.bd.p)[which(go.adj.p < 0.05)]
#max(go.sig.p)
go.sig.idx <- which(go.bd.p <= max(go.sig.p))
go.sig.col <- rep("black", length(go.bd.p))
go.sig.col[go.sig.idx] <- "red"
plot(go.bd.mat, -log10(go.bd.p), col = go.sig.col, pch = 16)

#pdf("~/Desktop/go_bd_mat.pdf", width = 15, height = 500)
#pheatmap(t(go.bd.mat))
#dev.off()

#what are the top differentially expressed terms?
#plot(sort(as.vector(go.bd.mat)))

go_expr_dir <- file.path(results.dir, paste0("GO_Expression_", comparison.type))
if(!file.exists(go_expr_dir)){dir.create(go_expr_dir)}

thresh.percentile = 90
go.high.thresh <- get.percentile(go.bd.mat[which(go.bd.mat > 0)], thresh.percentile)
go.low.thresh <- get.percentile(go.bd.mat[which(go.bd.mat < 0)], thresh.percentile)
go.sig.high <- idx_to_row_col(intersect(which(go.bd.p <= (max(go.sig.p))), which(go.bd.mat < -0.5)), nrow(go.bd.p))
#nrow(go.sig.high)
go.sig.low <- idx_to_row_col(intersect(which(go.bd.p <= (max(go.sig.p))), which(go.bd.mat > 0.5)), nrow(go.bd.p))
#nrow(go.sig.low)
if(nrow(go.sig.high) > 0){
    plot_set(result.mat = go.bd.mat, row.col.ind = go.sig.high, gene.list = bd.go.list,
        plot.by = "overall", file_path = go_expr_dir, file_tag = "GO-BD_intersection_high")
}

if(nrow(go.sig.low) > 0){
    plot_set(result.mat = go.bd.mat, row.col.ind = go.sig.low, gene.list = bd.go.list,
        plot.by = "overall", file_path = go_expr_dir, file_tag = "GO-BD_intersection_low")
}
i = 2
sig.ind <- which(go.bd.p[i,] <= max(sig.p))
sig.col <- rep("black", ncol(go.bd.p))
sig.col[sig.ind] <- "red"
plot(go.bd.mat[i,], col = sig.col, main = rownames(go.bd.mat)[i])

The following heat map shows the GO terms that have significant differential expression between the FC animals and the VS animals that had in the top 90% difference in expression between the two alleles.

Red values indicate that expression is higher in the VS animals and blue values indiate the expression is lower in the VS animals.

sub_not_sig <- function(vals, pvals, sig.thresh){
    newV <- rep(NA, length(vals))
    sig.idx <- which(pvals <= sig.thresh)
    newV[sig.idx] <- vals[sig.idx]
    return(newV)
}

go.all.sig <- t(sapply(1:nrow(go.bd.mat), function(x) sub_not_sig(go.bd.mat[x,], go.bd.p[x,], max(go.sig.p))))
has.row.vals <- which(apply(go.all.sig, 1, function(x) !all(is.na(x))))
has.col.vals <- which(apply(go.all.sig, 2, function(x) !all(is.na(x))))
sub.mat <- t(go.bd.mat[has.row.vals, has.col.vals])
pheatmap(sub.mat, cluster_rows = FALSE, cluster_cols = FALSE)
kegg.gene.lists <- list.files(kegg_expr_dir, pattern = ".txt", full.names = TRUE)
kegg.gene.tables <- Reduce("rbind", lapply(kegg.gene.lists, function(x) read.table(x, header = TRUE, sep = "\t")))
u_pairs <- unique(cbind(kegg.gene.tables[,"Biodomain"], kegg.gene.tables[,"Intersecting_Term"]))
pair.names <- apply(u_pairs, 1, function(x) paste(x, collapse = " - "))

#make a list of all genes for all unique pairs of biodomains and kegg terms
inter.gene.list <- vector(mode = "list", length = nrow(u_pairs))
names(inter.gene.list) <- pair.names
for(i in 1:nrow(u_pairs)){
    bd.idx <- which(kegg.gene.tables[,"Biodomain"] == u_pairs[i,1])
    k.idx <- which(kegg.gene.tables[,"Intersecting_Term"] == u_pairs[i,2])
    pair.idx <- intersect(bd.idx, k.idx)
    inter.gene.list[[i]] <- kegg.gene.tables[pair.idx,"GeneID"]
}


kegg.jmat <- jaccard.matrix(inter.gene.list)

pdf("~/Desktop/jaccard.pdf", width = 15, height = 15)
pheatmap(kegg.jmat)
dev.off()

ku_gene <- unique(kegg.gene.tables[,"GeneID"])

all_kegg <- plot_intersect_expr(intersect.mat = kegg.bd.mat, 
    gene.set = ku_gene, gene.list = intersect.list, 
    expr.mat = scaled.expr, covar.table = info, ref.allele = fc.allele, 
    alt.allele = vs.allele, plot.by = "overall", label.srt = 90, las = 2,
    return.data = TRUE)
group.cols <- c(rep("#7fc97f", nrow(all_kegg[[1]])), rep("#beaed4", nrow(all_kegg[[2]])))
kegg_mat <- Reduce("rbind", all_kegg)
plot.decomp(kegg_mat, cols = group.cols, pc = 4) 

geno.df <- data.frame(as.factor(info[rownames(kegg_mat), "genotype"]))
colnames(geno.df) <- "genotype"
rownames(geno.df) <- rownames(kegg_mat)
row.order <- order(geno.df[,1])

test <- pam(kegg_mat, k = 2)
row.order <- order(test$cluster)
pheatmap(kegg_mat[row.order,], cluster_rows = FALSE, scale = "column", annotation_row = geno.df)

test_dim_red(t(kegg_mat), col = group.cols, pdf.filename = "~/Desktop/test_KEGG.pdf")
#plots a result from plot_intersect_expr
plot_result_decomp <- function(intersect_expr, pc = 2){
    group.cols <- c(rep("#7fc97f", nrow(intersect_expr[[1]])), rep("#beaed4", nrow(intersect_expr[[2]])))
    result_mat <- Reduce("rbind", intersect_expr)
    plot.decomp(result_mat, cols = group.cols, pc = pc) 
    invisible(result_mat)
}

#takes the matrix from plot_result_decomp
#and clusters it
plot_result_clusters <- function(result_mat, k = 2, order.by = c("genotype", "clusters")){
    
    order.by = order.by[1]
    geno.df <- data.frame(as.factor(info[rownames(result_mat), "genotype"]))
    colnames(geno.df) <- "genotype"
    rownames(geno.df) <- rownames(go_mat)
    
    if(order.by == "genotype"){
        row.order <- unlist(lapply(c("FC/FC", "WT/FC", "WT/VS", "VS/VS"), function(x) which(geno.df[,1] == x)))
    }

    test <- pam(result_mat, k = k)
    #test <- pam(plot.decomp(result_mat, plot.results = FALSE)$u, k = k)
    clust <- test$cluster
    clust_df <- data.frame(as.factor(clust))
    colnames(clust_df) <- "cluster"
    if(order.by == "clusters"){
        row.order <- order(test$cluster)
    }
    all.df <- data.frame(geno.df, clust_df)
    pheatmap(result_mat[row.order,], cluster_rows = FALSE, scale = "column", 
        annotation_row = all.df, show_colnames = FALSE)
}
go.gene.lists <- list.files(go_expr_dir, pattern = ".txt", full.names = TRUE)
go.gene.tables <- Reduce("rbind", lapply(go.gene.lists, function(x) read.table(x, header = TRUE, sep = "\t")))
u_pairs <- unique(cbind(go.gene.tables[,"Biodomain"], go.gene.tables[,"Intersecting_Term"]))
pair.names <- apply(u_pairs, 1, function(x) paste(x, collapse = " - "))

#make a list of all genes for all unique pairs of biodomains and kegg terms
inter.gene.list <- vector(mode = "list", length = nrow(u_pairs))
names(inter.gene.list) <- pair.names
for(i in 1:nrow(u_pairs)){
    bd.idx <- which(go.gene.tables[,"Biodomain"] == u_pairs[i,1])
    k.idx <- which(go.gene.tables[,"Intersecting_Term"] == u_pairs[i,2])
    pair.idx <- intersect(bd.idx, k.idx)
    inter.gene.list[[i]] <- go.gene.tables[pair.idx,"GeneID"]
}

go.jmat <- jaccard.matrix(inter.gene.list)

pdf("~/Desktop/jaccard_go.pdf", width = 15, height = 15)
pheatmap(go.jmat)
dev.off()

gu_gene <- unique(go.gene.tables[,"GeneID"])

all_go <- plot_intersect_expr(intersect.mat = go.bd.mat, 
    gene.set = gu_gene, gene.list = bd.go.list, 
    expr.mat = scaled.expr, covar.table = info, ref.allele = fc.allele, 
    alt.allele = vs.allele, plot.by = "overall", label.srt = 90, las = 2,
    return.data = TRUE)
go_mat <- plot_result_decomp(all_go, pc = 4)
plot_result_clusters(go_mat, k = 4)

#test_dim_red(t(go_mat), col = group.cols, pdf.filename = "~/Desktop/test_GO.pdf")
#KEGG
biodomain <- "Synapse"; kegg.p <- "Ribosome"
biodomain <- "Immune Response"; kegg.p <- "Ribosome"
biodomain <- "Proteostasis"; kegg.p <- "Ribosome"
biodomain <- "Oxidative Stress"; kegg.p <- "AMPK signaling pathway"
biodomain <- "Proteostasis"; kegg.p <- "Prion disease"
biodomain <- "Mitochondrial Metabolism"; kegg.p <- "Parkinson disease"
biodomain <- "Mitochondrial Metabolism"; kegg.p <- "Pathways of neurodegeneration - multiple diseases"

row.num <- which(rownames(kegg.bd.mat) == biodomain)
col.num <- which(colnames(kegg.bd.mat) == kegg.p)
ex_kegg <- plot_intersect_expr(intersect.mat = kegg.bd.mat, 
    row.num = row.num, col.num = col.num, gene.list = intersect.list, 
    expr.mat = scaled.expr, covar.table = info, ref.allele = fc.allele, 
    alt.allele = vs.allele, plot.by = "overall", label.srt = 90, las = 2,
    return.data = TRUE)
kegg_mat <- plot_result_decomp(ex_kegg, pc = 2)
plot_result_clusters(kegg_mat, k = 2)

#GO
biodomain <- "Mitochondrial Metabolism"; go.p <- "proton motive force-driven ATP synthesis"
biodomain <- "Lipid Metabolism"; go.p <- "phosphatidylinositol-3-phosphate biosynthetic process"
row.num <- which(rownames(go.bd.mat) == biodomain)
col.num <- which(colnames(go.bd.mat) == go.p)
ex_go <- plot_intersect_expr(intersect.mat = go.bd.mat, 
    row.num = row.num, col.num = col.num, gene.list = bd.go.list, 
    expr.mat = scaled.expr, covar.table = info, ref.allele = fc.allele, 
    alt.allele = vs.allele, plot.by = "overall", label.srt = 90, las = 2,
    return.data = TRUE)
go_mat <- plot_result_decomp(ex_go, pc = 2)
plot_result_clusters(go_mat, k = 2)
test.genes <- c("Rpl13a", "Ifng", "Rps3", "Rps6", "Rps19", "Rps25", "Rpl22")
test.genes <- c("Lrrk2") #Parkinson's gene. Elevated levels linked to neurodegeneration

gene.id <- mouse.genes[match(test.genes, mouse.genes[,"external_gene_name"]), "ensembl_gene_id"]
names(gene.id) <- test.genes
gene.id <- gene.id[which(!is.na(gene.id))]


#p.order <- order(vs.diff.expr[,"p"], decreasing = FALSE)
#head(vs.diff.expr[p.order,])
#gene.id <- rownames(vs.diff.expr[p.order,])[1:500]
#gene.names <- mouse.genes[match(test.genes, mouse.genes[,"ensembl_gene_id"]), "external_gene_name"]
#names(gene.id) <- gene.names

test_set <- plot_intersect_expr(intersect.mat = kegg.bd.mat, 
    gene.set = gene.id, gene.list = intersect.list, 
    expr.mat = scaled.expr, covar.table = info, ref.allele = fc.allele, 
    alt.allele = vs.allele, plot.by = "overall", label.srt = 90, las = 2,
    return.data = TRUE)

set_mat <- plot_result_decomp(test_set, pc = 2)
plot_result_clusters(set_mat, k = 2, order.by = "clusters")


test.var <- lapply(test_set, function(x) apply(x, 1, var))
vioplot(test.var)

We made a data object with the adjusted gene expression split into the comparison groups for further downstream analysis.

all.gene.id <- rownames(scaled.expr)
downstream_set <- plot_intersect_expr(intersect.mat = kegg.bd.mat, 
    gene.set = all.gene.id, gene.list = intersect.list, 
    expr.mat = scaled.expr, covar.table = info, ref.allele = fc.allele, 
    alt.allele = vs.allele, plot.by = NA, label.srt = 90, las = 2,
    return.data = TRUE)

saveRDS(downstream_set, file.path(results.dir, 
    paste0("Adjusted_Expression_Separated_by_Group_", comparison.type, ".RDS")))